This document is an adaptation of “Use OpenAI text embeddings for horror movie descriptions” by Julia Silge.
The goal is to apply my training in R to a real-world data science project, including the use of the OpenAI API. The data used here comes from the TidyTuesday project.
#load r packages
library(tidytuesdayR)
library(tidymodels)
library(httr)
#download the TidyTuesday release dated 2022-11-01 from GitHub with the tidytuesdayR package
tuesdata <- tt_load("2022-11-01")
#keep only the horror movie table from that release
horror_movies <- tuesdata[["horror_movies"]]
#fix the random seed so the same rows are drawn on every run
set.seed(123)
#keep English-language movies that have an overview,
#then use slice_sample to randomly select 1,000 rows
horror_movies_df <- horror_movies %>%
  filter(original_language == "en") %>%
  filter(!is.na(overview)) %>%
  slice_sample(n = 1000)
#print a column-by-column summary of the sample
glimpse(horror_movies_df)
## Rows: 1,000
## Columns: 20
## $ id <dbl> 751453, 753328, 696605, 46020, 217787, 698676, 14229…
## $ original_title <chr> "Sushi Night", "Spout", "What Josiah Saw", "Sharktop…
## $ title <chr> "Sushi Night", "Spout", "What Josiah Saw", "Sharktop…
## $ original_language <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en"…
## $ overview <chr> "After having a dinner date, a man realizes his love…
## $ tagline <chr> NA, NA, "You do what need be done then.", "Half-shar…
## $ release_date <date> 2020-10-08, 2009-11-21, 2021-08-13, 2010-09-25, 201…
## $ poster_path <chr> "/s43doT1jZ1yrTibqddL4l2ekHaJ.jpg", "/1WXajyutGGPlms…
## $ popularity <dbl> 0.600, 0.600, 5.622, 8.925, 4.859, 0.871, 3.221, 2.0…
## $ vote_count <dbl> 0, 0, 23, 138, 46, 0, 47, 6, 4, 45, 0, 0, 1, 5, 4, 7…
## $ vote_average <dbl> 0.0, 0.0, 6.0, 4.5, 3.9, 0.0, 5.1, 4.5, 7.9, 4.3, 0.…
## $ budget <dbl> 0, 0, 0, 0, 3600000, 0, 0, 0, 0, 0, 0, 0, 0, 4000, 0…
## $ revenue <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30000, …
## $ runtime <dbl> 9, 17, 120, 89, 89, 84, 93, 74, 112, 88, 9, 75, 85, …
## $ status <chr> "Released", "Released", "Released", "Released", "Rel…
## $ adult <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ backdrop_path <chr> NA, NA, "/d3rvdCFRHydPhb9bxnMBUFMEA9I.jpg", "/lHVxlW…
## $ genre_names <chr> "Horror, Thriller", "Drama, Horror", "Horror, Thrill…
## $ collection <dbl> NA, NA, NA, 370374, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ collection_name <chr> NA, NA, NA, "Sharktopus Collection", NA, NA, NA, NA,…
#print 3 randomly chosen movie overviews
horror_movies_df %>%
  pull(overview) %>%
  sample(size = 3)
## [1] "Victor Reynolds arrives at the notorious House of Usher, whereupon he is greeted by old acquaintances Roderick and Madeline Usher and their servant, Markus. As Victor uncovers more about the history of the house and the disappearances of those that entered it previously, he begins to realize that he is in mortal danger."
## [2] "After having a feud with director Kenneth J. Hall, producer Fred Olen Ray hired Ted Newsom to shoot brand new footage (on video) to weave in with scenes from Hall's film Evil Spawn (1987)."
## [3] "A 16mm psychodrama about a young woman who, obsessed with transcribing her thoughts to a myriad of post-it notes, finds herself struggling to escape a surreal anxiety attack."
Set your OpenAI API key as an environment variable named OPENAI_API_KEY using the
Sys.setenv() function
#request text embeddings from the OpenAI API
#text embeddings are representations of text learned from large datasets
embeddings_url <- "https://api.openai.com/v1/embeddings"
#the API key is read from the OPENAI_API_KEY environment variable
#and sent in the Authorization header as a bearer token
auth <- add_headers(
  Authorization = paste("Bearer", Sys.getenv("OPENAI_API_KEY"), sep = " ")
)
#request body: the embedding model plus every sampled movie overview as input
body <- list(
  model = "text-embedding-ada-002",
  input = horror_movies_df$overview
)
#send the request, with the body encoded as JSON
resp <- POST(url = embeddings_url, config = auth, body = body, encode = "json")
Check resp$status_code to confirm that the API call succeeded
#200 = success
#401 = lack of valid auth credentials
#stop with an error on any HTTP failure (4xx/5xx) so that a failed request
#is not parsed further down as if it contained embeddings
stop_for_status(resp)
#print the status code for the record
resp$status_code
## [1] 200
#read the response body as UTF-8 text and parse the JSON into an R list
#flatten = TRUE asks jsonlite to flatten nested data frames
embeddings <- jsonlite::fromJSON(
  content(resp, as = "text", encoding = "UTF-8"),
  flatten = TRUE
)
#extract the "embedding" entry nested under "data" in the parsed response
#presumably one numeric vector per overview, in the order they were sent
embed_extract <- pluck(embeddings, "data", "embedding")
#attach the extracted embeddings to the horror movie data as a new list-column
horror_embed <- mutate(horror_movies_df, embeddings = embed_extract)
#preview the id, title and embedding of each movie
select(horror_embed, id, original_title, embeddings)
## # A tibble: 1,000 × 3
## id original_title embeddings
## <dbl> <chr> <list>
## 1 751453 Sushi Night <dbl [1,536]>
## 2 753328 Spout <dbl [1,536]>
## 3 696605 What Josiah Saw <dbl [1,536]>
## 4 46020 Sharktopus <dbl [1,536]>
## 5 217787 Paranormal Whacktivity <dbl [1,536]>
## 6 698676 Dark Web: Mystery Box <dbl [1,536]>
## 7 14229 Ti piace Hitchcock? <dbl [1,536]>
## 8 364094 Fun Size Horror: Volume Two <dbl [1,536]>
## 9 476484 Before I Die <dbl [1,536]>
## 10 407626 Ozark Sharks <dbl [1,536]>
## # … with 990 more rows
Create a matrix where every row is a movie and every column is one dimension of the OpenAI embedding
#flatten the list-column of embeddings into one long vector, then fill a
#1536-column matrix row by row so that each row holds one movie's embedding
embeddings_mat <- horror_embed %>%
  pull(embeddings) %>%
  unlist() %>%
  matrix(ncol = 1536, byrow = TRUE)
#compute a cosine similarity matrix
#step 1: scale every row (movie) to unit length; dividing a matrix by a vector
#with one entry per row divides row i by the i-th norm
embeddings_sim <- embeddings_mat / sqrt(rowSums(embeddings_mat^2))
#step 2: multiply the unit-length rows by their own transpose, so BOTH vectors
#in each dot product are normalised (using the raw matrix on the right-hand side
#only gives cosine similarity if every embedding already has length 1)
embeddings_sim <- tcrossprod(embeddings_sim)
#one row and one column per movie
dim(embeddings_sim)
## [1] 1000 1000
#look at the title and overview of the movie in row 4
horror_movies_df %>%
  select(title, overview) %>%
  slice(4)
## # A tibble: 1 × 2
## title overview
## <chr> <chr>
## 1 Sharktopus "The U.S. Navy's special group \"Blue Water\" builds a half-shark,…
#rank every movie by its similarity to the movie in row 4, most similar first
#"movie" is the row index in horror_movies_df
embeddings_sim[4, ] %>%
  enframe(name = "movie", value = "similarity") %>%
  arrange(desc(similarity))
## # A tibble: 1,000 × 2
## movie similarity
## <int> <dbl>
## 1 4 1.00
## 2 935 0.857
## 3 379 0.849
## 4 380 0.847
## 5 533 0.841
## 6 898 0.840
## 7 605 0.837
## 8 914 0.826
## 9 745 0.825
## 10 849 0.825
## # … with 990 more rows
#look at the three movies ranked closest to row 4 in the similarity table
horror_movies_df %>%
  select(title, overview) %>%
  slice(c(935, 379, 380))
## # A tibble: 3 × 2
## title overview
## <chr> <chr>
## 1 Octaman "A scientific team in Mexico discover a pool of …
## 2 Dark Waters "Moneyless, ocean-exploring gigolo and his world…
## 3 Mega Shark vs. Giant Octopus "The California coast is terrorized by two enorm…
The first principal component (PC1) explains the largest share of the “variation” in the text embeddings
#identify the first 32 principal components of the embedding matrix
#use prcomp_irlba for faster computation
#the seed is set first so the result is reproducible
set.seed(234)
horror_pca <- irlba::prcomp_irlba(embeddings_mat, n = 32)
#put the component scores side by side with the original movie columns
augmented_pca <- bind_cols(as_tibble(horror_pca$x), horror_movies_df)
Plot principal components
#scatter plot of PC1 vs PC2, one point per movie, colored by vote average
ggplot(augmented_pca, aes(PC1, PC2, color = vote_average)) +
  geom_point(alpha = 0.8, size = 1.2) +
  scale_color_viridis_c()
We observe no clear relationship between vote average and the first two principal components of the text. In this sample, the movie description does not appear to be related to ratings.
#make plot interactive
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:httr':
##
## config
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#assign previously constructed plot to variable p
#the extra text aesthetic maps the movie title so that ggplotly has something
#to display for tooltip = "text"; without it the hover labels have no content
p <- augmented_pca %>%
  #PC1 vs PC2 colored by vote average
  ggplot(aes(x = PC1, y = PC2, color = vote_average, text = title)) +
  geom_point(size = 1.2, alpha = 0.8) +
  scale_color_viridis_c()
#run in ggplotly function to explore; hovering over a point shows its title
ggplotly(p, tooltip = "text")